{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- [OneHotEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)\n",
    "- [DictVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer)\n",
    "- [FeatureHasher](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html#sklearn.feature_extraction.FeatureHasher)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.feature_extraction import DictVectorizer,FeatureHasher\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 0 3]\n",
      " [1 1 0]\n",
      " [0 2 1]\n",
      " [1 0 2]]\n",
      "[2 3 4]\n"
     ]
    }
   ],
   "source": [
    "enc = OneHotEncoder()\n",
    "enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2],[1,1,1]]) \n",
    "a=np.array([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n",
    "print(a)\n",
    "# 基于列进行编码\n",
    "print(enc.n_values_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 把0放到第一列[0,1]比对，把1放到第二列[0,1,2]比对，把3放到第三列[0,1,2，3]比对\n",
    "enc.transform([[0, 1, 3]]).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 2.  0.  1.]\n",
      " [ 0.  2.  3.]]\n",
      "['bar', 'baz', 'foo']\n",
      "[[ 0.  0.  4.]]\n"
     ]
    }
   ],
   "source": [
    "# 将特征值映射列表转换为向量\n",
    "dv = DictVectorizer(sparse=False)\n",
    "D = [{'foo':1, 'bar':2}, {'foo':3, 'baz': 2}]\n",
    "X = dv.fit_transform(D)\n",
    "print (X)\n",
    "print (dv.get_feature_names())\n",
    "print (dv.transform({'foo':4, 'unseen':3}))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  4.,  1.,  0.,  0.,  0.,  0.,  0.,  2.],\n",
       "       [ 0.,  0.,  0.,  2.,  5.,  0.,  0.,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# hash转换\n",
    "h = FeatureHasher(n_features=10,non_negative=True)\n",
    "D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]\n",
    "f = h.transform(D)\n",
    "f.toarray()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
